import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
pd.options.mode.chained_assignment = None
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import dalex as dx
import pickle
import ast
import warnings
warnings.filterwarnings('ignore')
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
# Load the cross-validation results produced by the tuning step; the CSV's
# first column holds the row index.
clf_results_df = pd.read_csv('CV_results.csv', index_col = 0)
# Metrics inspected throughout this analysis.
used_metrics = ['roc_auc', 'f1', 'accuracy']
# Notebook-style preview of the loaded table (no effect when run as a script).
clf_results_df.head()
# TODO: add commentary on the CV-results preview above.
# Display a pre-rendered figure of the models (notebook-only helper).
from IPython.display import Image
Image(filename='models.png')
# Attach a precomputed cluster assignment (one label per CV-results row).
clf_results_df['clusters'] = np.load('clusters.npy')
pd.set_option('display.max_columns', None)
import re
# Concatenate all column names into one string so regexes can pull out the
# hyperparameter columns ("param_*") and the mean-test-score columns.
clf_results_df_str = ' '.join(list(clf_results_df.columns))
params = re.findall("param_[a-z]*_?[a-z]*_?[a-z]*", clf_results_df_str)
# "[a-z1]" also admits the digit 1 so that e.g. "mean_test_f1" matches.
mean_test = re.findall("mean_test_[a-z1]*_?[a-z]*_?[a-z]*", clf_results_df_str)
params_mean_test = params + mean_test
# Notebook-style echo of the selected column names.
params_mean_test
# Mean values of the hyperparameters and selected metrics by the model's cluster membership
clf_results_df.groupby('clusters').mean()[params_mean_test]
def find_indexes(X, k):
    """Return the index labels of rows whose roc_auc rank is in ``k``.

    Parameters
    ----------
    X : pd.DataFrame
        CV-results table with a 'rank_test_roc_auc' column.
    k : iterable
        Rank values to select.

    Returns
    -------
    list
        Index labels of the matching rows, in table order.
    """
    selected = X['rank_test_roc_auc'].isin(k)
    return list(X.index[selected])
# Pick the models whose roc_auc rank matches the hand-chosen ranks below.
indexes_of_best = find_indexes(clf_results_df, [1,2,3,5,21])
# NOTE(review): find_indexes returns index *labels* while .iloc is positional;
# this only works if the CSV index is the default 0..n-1 range — confirm.
best_params = clf_results_df.iloc[pd.Index(indexes_of_best),:]['params']
best_params.reset_index(drop = True, inplace = True)
# Print each selected hyperparameter configuration (stored as a dict string).
for params in best_params:
    print(params, "\n")
def get_task(path, task='mort'):
    """Load the pickled label list and extract the labels for one task.

    Parameters
    ----------
    path : str
        Path to a pickle file holding an iterable of per-sample label tuples
        ordered as (mortality, readmission, length-of-stay, diagnosis).
    task : str, default 'mort'
        Which task's labels to extract: 'mort', 'readmit', 'los' or 'dx'.
        The default keeps the original behavior (mortality labels).

    Returns
    -------
    np.ndarray
        One label per sample for the requested task.

    Raises
    ------
    KeyError
        If ``task`` is not one of the four known task names.
    """
    with open(path, 'rb') as f:
        # NOTE: pickle.load must only be used on trusted files.
        labels = pickle.load(f)
    task_index = {'mort': 0, 'readmit': 1, 'los': 2, 'dx': 3}[task]
    return np.array([yy[task_index] for yy in labels])
#Loading data
# X48: tabular/time-series features; w2v: embedding features.
# NOTE(review): presumably both are 2-D arrays with matching row counts, one
# row per patient/stay — confirm against the preprocessing code.
X = np.load("./local_mimic/save/X48.npy")
Z = np.load("./local_mimic/save/w2v.npy")
# Mortality labels ('mort' entry inside get_task).
y = get_task("./local_mimic/save/y")
#Data transformations
X, Z, y = np.array(X), np.array(Z), np.array(y)
# Concatenate the two feature blocks column-wise into one design matrix.
X = np.append(X, Z, axis=1)
X = pd.DataFrame(X)
### ast.literal_eval - converts the params string into a dict
### ** unpacks it so that xgboost receives the given hyperparameters
xgb_1 = XGBClassifier(**ast.literal_eval(best_params[0]))
xgb_2 = XGBClassifier(**ast.literal_eval(best_params[1]))
xgb_3 = XGBClassifier(**ast.literal_eval(best_params[2]))
xgb_4 = XGBClassifier(**ast.literal_eval(best_params[3]))
xgb_5 = XGBClassifier(**ast.literal_eval(best_params[4]))
# sanity check: xgboost accepted the parameters it was given
print(best_params[4])
# Notebook-style echo of the configured estimator.
xgb_5
models = [xgb_1, xgb_2, xgb_3, xgb_4, xgb_5]
# Fit every selected configuration on the full data set (no held-out split
# here; the explanations below are computed on the training data).
for model in models:
    model.fit(X,y)
# Wrap each fitted model in a dalex Explainer (labels distinguish the plots).
xgb_1_exp = dx.Explainer(xgb_1, X, y, label = "XGB 1")
xgb_2_exp = dx.Explainer(xgb_2, X, y, label = "XGB 2")
xgb_3_exp = dx.Explainer(xgb_3, X, y, label = "XGB 3")
xgb_4_exp = dx.Explainer(xgb_4, X, y, label = "XGB 4")
xgb_5_exp = dx.Explainer(xgb_5, X, y, label = "XGB 5")
# Permutation variable importance (dalex model_parts) for each model.
vi_xgb_1 = xgb_1_exp.model_parts()
vi_xgb_2 = xgb_2_exp.model_parts()
vi_xgb_3 = xgb_3_exp.model_parts()
vi_xgb_4 = xgb_4_exp.model_parts()
vi_xgb_5 = xgb_5_exp.model_parts()
# Overlay all five variable-importance profiles in one plot.
vi_xgb_1.plot([vi_xgb_2, vi_xgb_3, vi_xgb_4, vi_xgb_5])
def get_k_best_variables(vi_model, k):
    """Return the variables ranked 2..k by dropout loss (descending).

    The slice deliberately starts at position 1, skipping the single row
    with the highest dropout loss, so at most ``k - 1`` names come back.
    # NOTE(review): presumably the skipped row is dalex's `_baseline_`
    # pseudo-variable, which tends to have the largest dropout loss — confirm.
    """
    ranked = vi_model.result.sort_values(by='dropout_loss', ascending=False)
    return set(ranked.iloc[1:k]['variable'])
# Variables that appear in every model's top-k importance list.
important_variables = set.intersection(get_k_best_variables(vi_xgb_1, 17),
                                       get_k_best_variables(vi_xgb_2, 17),
                                       get_k_best_variables(vi_xgb_3, 17),
                                       get_k_best_variables(vi_xgb_4, 17),
                                       get_k_best_variables(vi_xgb_5, 17))
# Notebook-style echo of the shared important variables.
important_variables
# Partial-dependence profiles restricted to the shared important variables.
pd_xgb_1 = xgb_1_exp.model_profile(variables = list(important_variables))
pd_xgb_2 = xgb_2_exp.model_profile(variables = list(important_variables))
pd_xgb_3 = xgb_3_exp.model_profile(variables = list(important_variables))
pd_xgb_4 = xgb_4_exp.model_profile(variables = list(important_variables))
pd_xgb_5 = xgb_5_exp.model_profile(variables = list(important_variables))
# Overlay the five PD profiles in one plot.
pd_xgb_1.plot([pd_xgb_2, pd_xgb_3, pd_xgb_4, pd_xgb_5])
# Accumulated-local (ALE) profiles for the same shared variables.
al_xgb_1 = xgb_1_exp.model_profile(variables = list(important_variables), type = 'accumulated')
al_xgb_2 = xgb_2_exp.model_profile(variables = list(important_variables), type = 'accumulated')
al_xgb_3 = xgb_3_exp.model_profile(variables = list(important_variables), type = 'accumulated')
al_xgb_4 = xgb_4_exp.model_profile(variables = list(important_variables), type = 'accumulated')
al_xgb_5 = xgb_5_exp.model_profile(variables = list(important_variables), type = 'accumulated')
# Overlay the five AL profiles in one plot.
al_xgb_1.plot([al_xgb_2, al_xgb_3, al_xgb_4, al_xgb_5])
al_models = [al_xgb_1, al_xgb_2, al_xgb_3, al_xgb_4, al_xgb_5]
pd_models = [pd_xgb_1, pd_xgb_2, pd_xgb_3, pd_xgb_4, pd_xgb_5]
# Relabel the AL profiles so they are distinguishable when drawn together
# with the PD profiles of the same model.
for al_model in al_models:
    al_model.result['_label_'] = [x + ' AL profiles' for x in al_model.result['_label_']]
# One PD-vs-AL comparison plot per model.
for i in range(5):
    al_models[i].plot(pd_models[i])
# Hyperparameters to visualise, and whether each should use a log x-scale.
params = ['nrounds', 'min_child_weight', 'lambda', 'eta', 'colsample_bytree', 'colsample_bylevel', 'alpha']
logs = [False, True, True, True, False, False, True]
def plot_one_param(df, param, metrics, log):
    """Draw three side-by-side panels of mean test score vs one hyperparameter.

    Parameters
    ----------
    df : pd.DataFrame
        CV-results table containing 'param_<param>' and 'mean_test_<metric>'
        columns.
    param : str
        Hyperparameter name (without the 'param_' prefix).
    metrics : sequence of str
        Exactly three metric names, one per panel.
    log : bool
        If True, use a logarithmic x-scale on every panel.
    """
    col = 'param_' + param
    # Group once; the aggregation is identical for all three panels.
    grouped = df.groupby(col).mean()
    fig, axes = plt.subplots(1, 3, sharey=True, figsize=(8, 4))
    for idx, (ax, metric) in enumerate(zip(axes, metrics)):
        sns.lineplot(ax=ax, data=grouped, x=col, y='mean_test_' + metric)
        ax.set_ylabel('mean test score', fontsize=14)
        ax.set_xlabel("")
        label = col
        if log:
            ax.set(xscale="log")
            label = "log(" + label + ")"
        # Only the middle panel carries the shared x-axis label.
        if idx == 1:
            ax.set_xlabel(label, fontsize=12)
        ax.set_title(metric)
# Draw the metric-vs-hyperparameter panels for every tracked hyperparameter.
for param, log in zip(params, logs):
    plot_one_param(clf_results_df, param, used_metrics, log)
# Requirements: